In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Print NumPy floats with a precision of two digits to keep array output compact.
np.set_printoptions(precision=2)

Text Feature Extraction with Bag-of-Words


In [ ]:
# Toy corpus: each list entry is treated as one document.
X = [
    "Some say the world will end in fire,",
    "Some say in ice.",
]

In [ ]:
# Number of documents in the corpus.
len(X)

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vectorizer and learn the vocabulary in one step; fit() returns
# the fitted estimator, so the calls can be chained.
vectorizer = CountVectorizer().fit(X)

# Display the learned vocabulary_ attribute.
vectorizer.vocabulary_

In [ ]:
# Encode every document in X with the vocabulary fitted above; the bare name
# on the last line displays the result's repr.
X_bag_of_words = vectorizer.transform(X)
X_bag_of_words

In [ ]:
# Convert to a dense array so the individual entries are printed.
print(X_bag_of_words.toarray())

In [ ]:
print(X)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns an array of the
# vocabulary terms, ordered to match the columns of the transformed matrix.
vectorizer.get_feature_names_out()

In [ ]:
# Map the encoded rows back to vocabulary terms.
vectorizer.inverse_transform(X_bag_of_words)

TF-IDF Encoding


In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the same corpus.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X)

# get_feature_names() was removed in scikit-learn 1.2; use the
# get_feature_names_out() replacement (available since 1.0) to list the terms.
print(tfidf_vectorizer.get_feature_names_out())
# Dense view of the TF-IDF encoding of X.
print(tfidf_vectorizer.transform(X).toarray())

Bigrams and N-Grams


In [ ]:
# Show the corpus again for reference.
X

In [ ]:
# Look at sequences of tokens of minimum length 2 and maximum length 2,
# i.e. bigrams only.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_vectorizer.fit(X)
# get_feature_names() was removed in scikit-learn 1.2; use the
# get_feature_names_out() replacement (available since 1.0).
bigram_vectorizer.get_feature_names_out()

In [ ]:
# Dense view of the bigram encoding of X.
bigram_vectorizer.transform(X).toarray()

In [ ]:
# ngram_range=(1, 2): token sequences of minimum length 1 and maximum
# length 2, i.e. single tokens together with bigrams.
gram_vectorizer = CountVectorizer(ngram_range=(1, 2))
gram_vectorizer.fit(X)

# get_feature_names() was removed in scikit-learn 1.2; use the
# get_feature_names_out() replacement (available since 1.0).
gram_vectorizer.get_feature_names_out()

In [ ]:
# Encode the corpus with the combined (1, 2)-gram vocabulary, then print the
# matrix dimensions followed by its dense contents.
X_1_2_gram = gram_vectorizer.transform(X)
print(X_1_2_gram.shape)
print(X_1_2_gram.toarray())

Character n-grams


In [ ]:
# analyzer="char" builds n-grams from characters instead of word tokens;
# ngram_range=(2, 3) keeps sequences of minimum length 2 and maximum length 3.
char_vectorizer = CountVectorizer(ngram_range=(2, 3), analyzer="char")
char_vectorizer.fit(X)
# get_feature_names() was removed in scikit-learn 1.2; use the
# get_feature_names_out() replacement (available since 1.0).
print(char_vectorizer.get_feature_names_out())